#delimit;
** REPLACE FILE PATH WITH PATH TO RELEVANT REPLICATION FILES;
local fileloc = "~/KMS_REPLICATION";
set more off;
set logtype text;
capture log close fakepollution;

log using `fileloc'/log_files/fakepollution.txt, name(fakepollution) replace;
pause on;

global graph_options "graphregion(fcolor(white) color(white) icolor(white)) plotregion()";


** Note: this do-file builds the data, with generated errors, used for the Monte Carlo analysis;

***************************************************************************
******************IMPORT POLLUTION DATA (DAILY LEVEL)**********************
**************************************************************************;

clear all;

** Data from CARB website using DVD files (http://www.arb.ca.gov/aqd/aqdcd/aqdcddld.htm);

** All notes and data cleaning process follow that in the main dataset construction do-file titled "importing_daily_pollution". See said do-file for extensive notes;

***************START WITH CO, NOx, AND 03 (TRUE DAILY)********************;

** Variable names come from dailygas_csv_headers.txt;

insheet using `fileloc'/data/emissions_data/dlygas_csv.txt, clear comma nonames;

rename v2 location;
rename v3 date;
rename v9 comax8o;
rename v33 ozmax8o;

keep location date comax8o ozmax8o;
rename comax8o comax8hr;
rename ozmax8o ozmax8hr;

gen date2 = date(date, "MDY");
drop date;
rename date2 date;
format date %td;

keep if date >= td(1jan2002) & date <= td(1jan2008);

sort location date;

tempfile coandoz;
save `coandoz';

*************NOW DO PM10 and PM2.5, WHICH ARE EVERY 6 DAYS****************;

** Data from PM Mass Comma-delimited file (daily, hourly);

insheet using `fileloc'/data/emissions_data/PM10StdDaily20100120.txt, clear comma names;

gen date2 = date(date, "MDY");
drop date;
rename date2 date;
duplicates report site date;
keep if monitor == 1;
** Should now be no duplicates;
duplicates report site date;

** Keep only most common collection method;
keep if collectionmethod == "HVS09";

rename value pm10;

sort site date;

tempfile pm10;
save `pm10';

format date %td;

keep if date >= td(1jan2002) & date <= td(1jan2008);

** Rename site to location to match CO and O3 data;
rename site location;

**************** PUTTING ALL POLLUTANTS TOGETHER *******************;

tostring location, replace;
sort location date;
merge location date using `coandoz';
tab _merge;
drop _merge;

*************** ADDING LOCATION DATA FOR ZIP CALCS ****************;
sort location;
tempfile pollution;
save `pollution';

use `fileloc'/data/location_data/pollution_locations.dta, clear;
joinby location using `pollution';

keep location date pm10 ozmax8hr comax8hr;

** Unlikely  "0" readings are ever actually correct (especially given  surrounding values are not zeros) – turn all 0s to missing;
replace comax8hr = . if comax8hr == 0;
replace ozmax8hr = . if ozmax8hr == 0;
replace pm10 = . if pm10 == 0;

sort location date;

destring location, g(nostring) force;
drop if nostring == .;
drop nostring;

gen week = wofd(date);
format week %tw;
gen year = year(date);

collapse weekly_co = comax8hr weekly_oz = ozmax8hr weekly_pm10 = pm10 year, by(location week) fast;

order location week weekly_co weekly_oz weekly_pm10;
sort location week;

** Generate balanced panel for KMS time;

** CREATE BALANCED PANEL ;
keep if year >= 2002 & year <= 2007;

** Restrict to those present in all years;
foreach pollutant in weekly_pm10 weekly_co weekly_oz {;
	forvalues year = 2002/2007 {;
		gen `pollutant'_`year' = (`pollutant' ~= . & year == `year');
		egen `pollutant'_`year'_around = max(`pollutant'_`year'), by(location);		
		drop `pollutant'_`year';
	};
	replace `pollutant' = . if `pollutant'_2002_around == 0 | `pollutant'_2003_around == 0 | `pollutant'_2004_around == 0 | `pollutant'_2005_around == 0 | `pollutant'_2006_around == 0 | `pollutant'_2007_around == 0 ;

	drop *_around;

};

keep if weekly_co ~= . | weekly_oz ~= . | weekly_pm10 ~= .;

save `fileloc'/data/fakepollution.dta, replace;

***************************************************************************
****** CONSTRUCT MASTER FILE OF ALL DISTANCES EACH POLLUTION LOCATION *****
**************************************************************************;

use `fileloc'/data/location_data/pollution_locations.dta, clear;

destring location, g(nostring) force;
drop if nostring == .;
drop nostring;

gsort location, g(sort_location);
sum sort_location;
local maxlocation = r(max);

preserve;

clear;

tempfile pollution_to_pollution;
save `pollution_to_pollution', emptyok;

** Calculate distance between pollution monitors – we use this to assign pollution with error using all monitors within 20 miles;
quietly {;
	forvalues zip = 1/`maxlocation' {;
		restore;
		preserve;
		keep if sort_location == `zip';
		noisily di `maxlocation' - `zip';
		local lat = pollution_lat;
		local longi = pollution_long;
		local location = location;		
		use `fileloc'/data/location_data/pollution_locations.dta, clear;
		keep location pollution_lat pollution_long elevation;
		gen lat = `lat';
		gen longi = `longi';
		gen pollution_location = `location';
		qui vincenty lat longi pollution_lat pollution_long, hav(distance);
		keep location pollution_location distance;
		append using `pollution_to_pollution';
		save `pollution_to_pollution', replace;
	};
};

restore, not;

save `fileloc'/data/pollution_to_pollution_distance.dta, replace;

***************************************************************************
**NOW CALCULATE WEIGHTED POLLUTION VALUE USING ALL SENSORS WITHIN 20 MILES
***************************************************************************;

****;
****;
** GENERATE "FAKE" POLLUTION USING ALL OTHER MONITORS WITHIN 20 MILES;
****;
****;

use `fileloc'/data/fakepollution.dta, clear;
destring location, replace;
keep location weekly_pm10 week;

tempfile pol;
save `pol';

keep location week weekly_pm10;
rename weekly_pm10 weekly_pm10_real;
sort location week;
tempfile havedata;
save `havedata';

use `fileloc'/data/pollution_to_pollution_distance.dta, clear;
keep if distance <= 20;

** Drop own estimate;
drop if distance == 0;
destring location, replace force;
** Drop locations with string names – inconsistent information;
drop if location == .;

joinby location using `pol';

tempfile scatterfake;
save `scatterfake';

** Generate "fake" measure;
collapse weekly_pm10 (min) min_dist = distance [aw = 1/distance], by(pollution_location week) fast;
rename weekly_pm10 weekly_pm10_fake;
rename pollution_location location;

** Merge back to real data;
sort location week;
merge location week using `havedata';

tab _merge;
keep if _merge == 3;

****;
** FIND DIFFERENCE BETWEEN TRUE POLLUTION LEVEL AND PREDICTED ABOVE;
****;


** Generate month and year indicator;
gen month = month(dofw(week));
gen year = year(dofw(week));
egen monthXyear = group(month year);

** Generate difference between truth and estimated;
gen diff_raw = weekly_pm10_real - weekly_pm10_fake;
sum weekly_pm10_real, d;
local basemean = r(mean);
local twosd = r(mean) + 2*r(sd);

** Drop those with "missing" for the difference (either true or fake pollution is missing) for common sample with predicated bias;
drop if diff_raw == .;

replace min_dist = round(min_dist,0.5);

xi i.location;

** Remove location and month FEs;
areg diff _Iloc*, absorb(monthXyear);
predict resid, resid;
gen diff_minus_FE = resid;
drop resid;

xi i.location i.monthXyear;

** Use pollution-week database;
** Construct eps-hat_it from the "with fixed effects" model.;
reg weekly_pm10_real _I*;
predict pm10_resid, resid;

** Estimate OLS model to predict esp_hat_it
** LHS = eps-hat_it
** RHS has spline in "distance to nearest";
** Generate spline for distance;
mkspline distspline1 3 distspline2 6 distspline3 9 distspline4 12 distspline5 15 distspline6 = min_dist;
** RHS has spline in "actual pollution reading";
mkspline pmspline1 150 pmspline2 300 pmspline3 450 pmspline4 600 pmspline5 750 pmspline6 = weekly_pm10_real;
** RHS has an AR(1) component - lagged LHS, ie eps_hat_i(t-1);
gsort location, g(nonstring_location);
tsset nonstring_location week;
gen lagged_pm10 = L1.weekly_pm10_real;
gen lagged_resid = L1.diff_minus_FE;
** RHS doesn't need FEs: by construction are all zero;

reg diff_minus_FE lagged_resid pmspline* distspline*;
** Save coefficients for later predicted values;
preserve;
parmest, saving(`fileloc'/simulation/error_spline_coefs.dta, replace);
restore;

** Predict "predicted bias";
predict pred_bias;

** Create "squared unpredictable error" = SUE = (eps-hat_it - m-hat_it)^2;
gen SUE = (diff_minus_FE - pred_bias)^2;
tsset nonstring_location week;
gen lagged_SUE = L1.SUE;

save `fileloc'/simulation/togenerate_bias_inreal_data.dta, replace;


***************************************************************************
** FIGURE 6, PANELS A AND B
***************************************************************************;
** Graph error against minimum monitor distance and true PM10 level;
preserve;
	collapse weekly_pm10_resid = diff_minus_FE (sd) weekly_pm10_resid_sd = diff_minus_FE, by(min_dist) fast;

	twoway scatter weekly_pm10_resid min_dist,
		xtitle("Minimum Distance to Closest Monitor")
		ytitle("Error")
		$graph_options;
	graph export `fileloc'/graphs/weekly_pm10_resids_distance.eps, replace;	

restore;
preserve;
	collapse weekly_pm10_resid = diff_minus_FE (sd) weekly_pm10_resid_sd = diff_minus_FE, by(weekly_pm10_real) fast;

	twoway scatter weekly_pm10_resid weekly_pm10_real ,
		xtitle("True Pollution Reading")
		ytitle("Error")
		xline(`basemean', lcolor(red) lpattern(dash))
		xline(`twosd', lcolor(red) lpattern(dot))
		$graph_options;
	graph export `fileloc'/graphs//weekly_pm10_resids_truelevel.eps, replace;	
restore;

tsset nonstring_location week;

** Regress SUE on similar model to 1.2;
reg SUE lagged_SUE distspline* pmspline*;

** Save the coefficients from this model;
preserve;
parmest, saving(`fileloc'/simulation/squared_error_spline_coefs.dta, replace);
restore;

log close fakepollution;